01 - Multilayer perceptrons from scratch


In [1]:
import mxnet as mx
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from mxnet import gluon



Contexts


In [2]:
data_ctx = mx.cpu()
model_ctx = mx.cpu()
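
This notebook keeps everything on the CPU. If a CUDA-enabled MXNet build and a GPU are available, the model context could be pointed at the device instead; a minimal sketch, assuming a recent enough MXNet for `mx.context.num_gpus`:

# Optional: place the model on a GPU when one is detected, otherwise fall back to CPU
model_ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()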

MNIST dataset


In [3]:
num_inputs = 784
num_outputs = 10
batch_size = 64
num_examples = 60000

In [4]:
# MNIST data pre-processing
def transform(data, label):
    return data.astype(np.float32) / 255, label.astype(np.float32)
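
As a quick check (not an original cell), the transform maps raw pixel values from [0, 255] into [0, 1] and casts the label to float32:

# Sanity check: pixel values land in [0, 1] after the transform
raw = mx.nd.array([[0, 127, 255]])
scaled, lbl = transform(raw, np.int32(5))
print(scaled)   # ≈ [[0. 0.498 1.]]
print(lbl)      # 5.0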

In [5]:
train_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=True, transform=transform),
                                   batch_size,
                                   shuffle=True)
test_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=False, transform=transform),
                                  batch_size,
                                  shuffle=False)
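
Pulling one batch from the loader confirms the shapes the pipeline produces (an added sanity check):

# Peek at a single batch: images come back as (batch_size, 28, 28, 1), labels as (batch_size,)
for data, label in train_data:
    print(data.shape, label.shape)   # (64, 28, 28, 1) (64,)
    break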

Multilayer perceptron


In [6]:
num_hidden = 256
weight_scale = .01

In [7]:
#######################
#  Allocate parameters for the first hidden layer
#######################
W1 = mx.nd.random_normal(shape=(num_inputs, num_hidden),
                         scale=weight_scale,
                         ctx=model_ctx)
b1 = mx.nd.random_normal(shape=num_hidden,
                         scale=weight_scale,
                         ctx=model_ctx)

#######################
#  Allocate parameters for the second hidden layer
#######################
W2 = mx.nd.random_normal(shape=(num_hidden, num_hidden),
                         scale=weight_scale,
                         ctx=model_ctx)
b2 = mx.nd.random_normal(shape=num_hidden, 
                         scale=weight_scale, 
                         ctx=model_ctx)

#######################
#  Allocate parameters for the output layer
#######################
W3 = mx.nd.random_normal(shape=(num_hidden, num_outputs),
                         scale=weight_scale,
                         ctx=model_ctx)
b3 = mx.nd.random_normal(shape=num_outputs,
                         scale=weight_scale,
                         ctx=model_ctx)

params = [W1, b1, W2, b2, W3, b3]
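
A quick count of the allocated parameters for the 784-256-256-10 architecture (an added check): 784*256 + 256 + 256*256 + 256 + 256*10 + 10 = 269,322.

# Total number of scalar parameters across all six arrays
print(sum(p.size for p in params))   # 269322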

In [8]:
## Attaching gradients
for param in params:
    param.attach_grad()

Activation functions


In [9]:
# ReLU
def relu(X):
    return mx.nd.maximum(X, mx.nd.zeros_like(X))
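
A minimal check of the element-wise ReLU (added for illustration):

# Negative entries are clipped to zero; positive entries pass through unchanged
print(relu(mx.nd.array([-2.0, 0.0, 3.5])))   # [0.  0.  3.5]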

In [10]:
# Softmax
def softmax(y_linear):
    # Subtract the (global) max before exponentiating for numerical stability;
    # the constant cancels out in the normalization below
    exp = mx.nd.exp(y_linear - mx.nd.max(y_linear))
    # Sum over every axis except the batch axis, giving one partition term per row
    partition = mx.nd.nansum(data=exp,
                             axis=0,
                             exclude=True).reshape((-1, 1))
    return exp / partition
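
Because softmax normalizes each row into a probability distribution, every row of the output should sum to 1 (an added check):

# Each row of the softmax output sums to 1
logits = mx.nd.random_normal(shape=(4, 10))
print(mx.nd.sum(softmax(logits), axis=1))   # [1. 1. 1. 1.]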

Cross-entropy


In [11]:
def cross_entropy(yhat, y):
    return - mx.nd.nansum(data=(y * mx.nd.log(yhat)),
                          axis=0,
                          exclude=True)
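
For a one-hot label, the loss reduces to the negative log of the probability assigned to the true class; assigning probability 0.8 to the correct class gives -ln(0.8) ≈ 0.223 (an added illustration):

# Cross-entropy against a one-hot target picks out -log(p_true)
yhat = mx.nd.array([[0.8, 0.1, 0.1]])
y = mx.nd.array([[1.0, 0.0, 0.0]])
print(cross_entropy(yhat, y))   # ≈ [0.223]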

Softmax cross-entropy


In [12]:
def softmax_cross_entropy(yhat_linear, y):
    return - mx.nd.nansum(y * mx.nd.log_softmax(yhat_linear),
                          axis=0,
                          exclude=True)
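
The fused version should agree numerically with composing softmax and cross_entropy by hand, while staying stable for large logits (an added check):

# Both routes compute the same loss (≈ 0.241 for this example)
logits = mx.nd.array([[2.0, -1.0, 0.5]])
target = mx.nd.one_hot(mx.nd.array([0]), 3)
print(cross_entropy(softmax(logits), target))
print(softmax_cross_entropy(logits, target))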

Define the model


In [13]:
def net(X):
    #######################
    #  Compute the first hidden layer
    #######################
    h1_linear = mx.nd.dot(X, W1) + b1
    h1 = relu(h1_linear)

    #######################
    #  Compute the second hidden layer
    #######################
    h2_linear = mx.nd.dot(h1, W2) + b2
    h2 = relu(h2_linear)

    #######################
    #  Compute the output layer.
    #  We will omit the softmax function here
    #  because it will be applied
    #  in the softmax_cross_entropy loss
    #######################
    yhat_linear = mx.nd.dot(h2, W3) + b3
    return yhat_linear
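
Before training, a dummy batch can confirm that the network produces one row of 10 un-normalized scores per example (an added sketch):

# Forward a fake batch of 4 flattened images through the untrained network
dummy = mx.nd.random_normal(shape=(4, num_inputs), ctx=model_ctx)
print(net(dummy).shape)   # (4, 10)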

Optimizer


In [14]:
def SGD(params, lr):
    for param in params:
        param[:] = param - lr * param.grad
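
A single toy update illustrates the rule: each parameter moves against its gradient, scaled by the learning rate (an added illustration):

# One SGD step on a toy parameter: w <- w - lr * grad, with grad = 2*w here
w = mx.nd.array([1.0, 2.0])
w.attach_grad()
with mx.autograd.record():
    loss = mx.nd.sum(w * w)
loss.backward()
SGD([w], lr=0.1)
print(w)   # [0.8 1.6]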

Evaluate accuracy


In [15]:
def evaluate_accuracy(data_iterator, net):
    numerator = 0.
    denominator = 0.
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        output = net(data)
        predictions = mx.nd.argmax(output, axis=1)
        numerator += mx.nd.sum(predictions == label)
        denominator += data.shape[0]
    return (numerator / denominator).asscalar()
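
Run before any training, the randomly initialized network should score near chance level, roughly 0.1 for ten balanced classes (an added sanity check):

# Untrained network: expect roughly 10% test accuracy (random guessing over 10 classes)
print(evaluate_accuracy(test_data, net))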

Training


In [16]:
epochs = 10
learning_rate = .001
smoothing_constant = .01

In [17]:
for e in tqdm(range(epochs)):
    cumulative_loss = 0
    for i, (data, label) in enumerate(train_data):
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        label_one_hot = mx.nd.one_hot(label, 10)
        with mx.autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label_one_hot)
        loss.backward()
        SGD(params, learning_rate)
        cumulative_loss += mx.nd.sum(loss).asscalar()


    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" %
          (e, cumulative_loss/num_examples, train_accuracy, test_accuracy))


Epoch 0. Loss: 1.241500595998764, Train_acc 0.8836167, Test_acc 0.8843
Epoch 1. Loss: 0.33498516895771024, Train_acc 0.9244, Test_acc 0.9256
Epoch 2. Loss: 0.22894775596062342, Train_acc 0.9464833, Test_acc 0.944
Epoch 3. Loss: 0.16448531580368678, Train_acc 0.96243334, Test_acc 0.961
Epoch 4. Loss: 0.127963012166818, Train_acc 0.9690167, Test_acc 0.9643
Epoch 5. Loss: 0.10466508792390426, Train_acc 0.9759, Test_acc 0.9689
Epoch 6. Loss: 0.08728616865972678, Train_acc 0.9795833, Test_acc 0.9702
Epoch 7. Loss: 0.07480254614502191, Train_acc 0.9827333, Test_acc 0.9731
Epoch 8. Loss: 0.06456454908301433, Train_acc 0.9848667, Test_acc 0.9744
Epoch 9. Loss: 0.05640946247726679, Train_acc 0.98445, Test_acc 0.973

Using the model for prediction


In [18]:
# Define the prediction function: pick the class with the highest score
def model_predict(net, data):
    output = net(data)
    return mx.nd.argmax(output, axis=1)

In [19]:
samples = 10

In [20]:
# Sample 10 random data points from the test set
sample_data = gluon.data.DataLoader(dataset=gluon.data.vision.MNIST(train=False, transform=transform),
                                    batch_size=samples, shuffle=True)
for i, (data, label) in enumerate(sample_data):
    data = data.as_in_context(model_ctx)
    # Stitch the 10 images into a single 28 x 280 strip and tile to 3 channels for display
    im = mx.nd.transpose(data, (1, 0, 2, 3))
    im = mx.nd.reshape(im, (28, 10*28, 1))
    imtiles = mx.nd.tile(im, (1, 1, 3))

    plt.imshow(imtiles.asnumpy())
    plt.show()
    pred = model_predict(net, data.reshape((-1, 784)))
    print('model predictions are:', pred)
    print('true labels :', label)
    break


model predictions are: 
[7. 1. 5. 3. 2. 6. 5. 2. 6. 9.]
<NDArray 10 @cpu(0)>
true labels : 
[7. 1. 5. 3. 2. 6. 5. 2. 6. 9.]
<NDArray 10 @cpu(0)>